# Chunk option: do not show messages emitted by code chunks in the rendered
# report.
knitr::opts_chunk$set(message = FALSE)
# Packages used below. They are attached in this order; a package attached
# later masks same-named functions from earlier ones, so keep the order stable.
library(bslib)
library(dplyr)
library(ggplot2)
library(glue)
library(here)
library(lubridate)
library(plotly)
library(purrr)
library(readr)
library(rlang)
library(stringr)
library(tidyr)
# Make the black-and-white theme the default for every ggplot created below.
theme_set(theme_bw())

# Root directory to scan for report files; taken from the document `params`.
input_dir <- params$input_dir # here("data")
# Files whose path matches any of these names are aggregated (not per-user)
# reports and are skipped below.
aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
# TODO: only load last N weeks of data to keep RAM usage reasonably low

# list.dirs() is recursive and also returns `input_dir` itself; that entry is
# dropped so only the contents of sub-directories are listed.
report_files <- setdiff(list.dirs(input_dir), input_dir) %>%
  lapply(list.files, full.names = TRUE) %>%
  unlist() %>%
  # unlist(list()) is NULL when there are no sub-directories; coerce so that
  # tibble() still gets a (zero-length) `filename` column instead of none.
  as.character()

# One row per per-user report file, with the pieces of its file name split
# into columns and the snapshot date parsed.
user_dat <- tibble(filename = report_files) %>%
  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
  # Split the full path on "." into five pieces. `too_few = "debug"` keeps
  # paths with fewer pieces (missing pieces become NA), emits a warning and
  # adds the `filename_ok`, `filename_pieces` and `filename_remainder` columns.
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext"),
    too_few = "debug"
  ) %>%
  filter(
    str_detect(ext, "tsv|txt"), # only keep tab-delimited files
    !str_detect(username, "[0-9]"), # drop usernames that contain a digit
    username != "allusers"
  ) %>% # filter out the 'allusers' rows
  # The first piece is still a path; its last component is parsed as the date.
  mutate(date = as_date(basename(date)))
## Warning: Debug mode activated: adding variables `filename_ok`,
## `filename_pieces`, and `filename_remainder`.
# Distinct dates found in the per-user report files.
dates <- unique(user_dat$date)
# na.rm = TRUE: a file name whose date fails to parse becomes NA, which would
# otherwise turn the maximum into NA and leave every `date == most_recent_date`
# filter below without rows.
most_recent_date <- max(dates, na.rm = TRUE)
# Distinct usernames found in the per-user report files.
usernames <- unique(user_dat$username)

users_filter <- c("sovacoolkl", "kopardevn") # TODO optionally select certain users

# Save the file index, stamped with today's date, under results/.
user_dat %>% write_tsv(here("results", glue("user-dat_{today()}.tsv")))

Most recent summary (2023-10-09)

Disk usage in /data/CCBR on Biowulf

# Read every "summary" file of the most recent date, tag each row with the
# file it came from, split that file name into columns and keep only the
# rows for /data/CCBR.
summary_dat_recent <- user_dat %>%
  # optional extra condition: username %in% users_filter
  filter(date == most_recent_date & file == "summary") %>%
  pull(filename) %>%
  map(function(x) {
    mutate(read_tsv(x), filename = x)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    names = c("basepath", "path", "username", "file", "ext"),
    delim = ".",
    cols_remove = FALSE
  ) %>%
  filter(FolderPath == "/data/CCBR")

# Save the table, stamped with today's date, under results/.
write_tsv(
  summary_dat_recent,
  here("results", glue("summary-dat-recent_{today()}.tsv"))
)

# Names of the numeric columns of the recent summary (in column order); each
# one is treated as a metric to plot.
summary_metrics <- unique(
  pull(
    pivot_longer(summary_dat_recent, where(is.numeric), names_to = "metric"),
    metric
  )
)

# Users that appear among the 10 largest rows of at least one metric in the
# most recent summary. Metrics whose name contains "score"/"Score" are negated
# first, so for those the 10 smallest values are selected instead.
top_users <- summary_dat_recent %>%
  pivot_longer(all_of(summary_metrics), names_to = "metric") %>%
  mutate(
    value_adj = if_else(str_detect(metric, "[sS]core"), -value, value)
  ) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  pull(username) %>%
  unique()

# One horizontal bar chart per metric for the most recent summary, restricted
# to `top_users`. Each chart is converted with ggplotly() and wrapped in a
# bslib nav panel; the panels are combined into a pill-list navigation below.
plots <- summary_metrics %>% lapply(function(y_metric) {
  # Rows of the top users; shared by the ordering and the plot.
  top_dat <- summary_dat_recent %>%
    filter(username %in% top_users)
  # Score metrics are sorted on their negated value, matching the `value_adj`
  # rule used when `top_users` was selected.
  sort_sign <- if (str_detect(y_metric, "[sS]core")) -1 else 1
  # Bar order along the y axis: usernames sorted by the (adjusted) metric.
  user_order <- top_dat %>%
    arrange(sort_sign * .data[[y_metric]]) %>%
    pull(username) %>%
    unique() # factor() rejects duplicated levels
  p <- top_dat %>%
    mutate(username = factor(username, levels = user_order)) %>%
    ggplot(aes(
      # `.data[[name]]` looks up the column named by the string `y_metric`.
      x = .data[[y_metric]],
      y = username,
      fill = .data[[y_metric]],
      # `text` is not drawn by ggplot2; it is the tooltip shown by ggplotly().
      text = glue("{username}\n{y_metric}\n{FolderPath}")
    )) +
    geom_col() +
    labs(x = y_metric, y = "") +
    theme(legend.position = "none")
  nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
})
do.call(navset_pill_list, plots)
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore

Summary over time

# Read every "summary" file of every date, tag each row with the file it came
# from, split that file name into columns, derive the date from the part of
# `basepath` after its last "/", and keep only the rows for /data/CCBR.
summary_dat_all <- user_dat %>%
  # optional extra condition: username %in% users_filter
  filter(file == "summary") %>%
  pull(filename) %>%
  map(function(x) {
    mutate(read_tsv(x), filename = x)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    names = c("basepath", "path", "username", "file", "ext"),
    delim = ".",
    cols_remove = FALSE
  ) %>%
  mutate(date = as_date(str_replace(basepath, ".*/", ""))) %>%
  filter(FolderPath == "/data/CCBR") # TODO: repeat for /data/CCBR_Pipeliner

# Save the table, stamped with today's date, under results/.
write_tsv(
  summary_dat_all,
  here("results", glue("summary-dat-all_{today()}.tsv"))
)

# Users that appear among the 10 largest rows of at least one metric across
# all dates. Metrics whose name contains "score"/"Score" are negated first, so
# for those the 10 smallest values are selected instead.
top_users <- summary_dat_all %>%
  pivot_longer(all_of(summary_metrics), names_to = "metric") %>%
  mutate(
    value_adj = if_else(str_detect(metric, "[sS]core"), -value, value)
  ) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  pull(username) %>%
  unique()

# One time-series chart per metric (one line per user in `top_users`). Each
# chart is converted with ggplotly() and wrapped in a bslib nav panel; the
# panels are combined into a pill-list navigation below.
plots <- summary_metrics %>% lapply(function(y_metric) {
  p <- summary_dat_all %>%
    filter(username %in% top_users) %>%
    ggplot(aes(
      x = date,
      # `.data[[name]]` looks up the column named by the string `y_metric`.
      y = .data[[y_metric]],
      color = username,
      # The tooltip text is a discrete variable that differs on every row, so
      # without an explicit group each point would form its own group and
      # geom_line() would have nothing to connect.
      group = username,
      # `text` is not drawn by ggplot2; it is the tooltip shown by ggplotly().
      text = glue("{username}\n{y_metric}\n{FolderPath}\n{date}")
    )) +
    geom_line(alpha = 0.7) +
    geom_point() +
    labs(y = y_metric)
  nav_panel(title = y_metric, card_header(y_metric), ggplotly(p, tooltip = "text"))
})
do.call(navset_pill_list, plots)
TotalBytes
DuplicateBytes
PercentDuplicateBytes
TotalFiles
DuplicateFiles
PercentDuplicateFiles
TotalMeanAge
DuplicateMeanAge
AgeScore
DupScore
OccScore
OverallScore